author: Diogo Silva
In [2]:
import numpy as np
import seaborn as snb
import matplotlib.pyplot as plt
In [3]:
%matplotlib inline
In [4]:
from MyML.cluster.K_Means3 import K_Means
import MyML.helper.partition as partMod
In [5]:
n_samples = 1e4
n_samples = int(n_samples)
n_features = 2
n_clusters = 6
In [7]:
data = np.random.uniform(size=(n_samples, n_features)).astype(np.float32)
plt.plot(data[:,0],data[:,1],'.')
Out[7]: [scatter plot of the uniform random data]
In [34]:
samples_per_cluster = n_samples // n_clusters
g1 = np.random.normal(loc=(0,0), scale=1, size=(samples_per_cluster, n_features))
g2 = np.random.normal(loc=(10,5), scale=0.5, size=(samples_per_cluster, n_features))
g3 = np.random.normal(loc=(0,8), scale=0.1, size=(samples_per_cluster, n_features))
g4 = np.random.normal(loc=(-10,5), scale=(0.25,1), size=(samples_per_cluster, n_features))
g5 = np.random.normal(loc=(5,10), scale=0.5, size=(samples_per_cluster, n_features))
g6 = np.random.normal(loc=(10,-5), scale=0.5, size=(samples_per_cluster, n_features))
data = np.vstack((g1,g2,g3,g4,g5,g6))
# label each block of samples_per_cluster points with its cluster index
gt = np.empty(data.shape[0], dtype=np.int32)
for i in range(n_clusters):
    gt[i * samples_per_cluster:(i + 1) * samples_per_cluster] = i
plt.plot(data[:,0],data[:,1],'.')
Out[34]: [scatter plot of the six Gaussian clusters]
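The labeling loop above can also be written without an explicit loop; an equivalent one-liner using np.repeat (relying on the six blocks being stacked in order with equal sizes):
In [ ]:
gt = np.repeat(np.arange(n_clusters, dtype=np.int32), samples_per_cluster)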
In [35]:
np.unique(gt)
Out[35]: array([0, 1, 2, 3, 4, 5], dtype=int32)
In [36]:
foldername = "/home/diogoaos/QCThesis/datasets/gaussmix1e4/"
dataname = "data.csv"
gtname = "ground_truth.csv"
In [37]:
np.savetxt(foldername + dataname, data, delimiter = ",")
np.savetxt(foldername + gtname, gt, delimiter = ",")
In [38]:
data = np.genfromtxt(foldername + dataname, delimiter = ",", dtype = np.float32)
In [8]:
n_partitions = 100
# number of clusters for each K-Means run: between sqrt(n)/2 and sqrt(n)
ensemble_clusters = [np.sqrt(n_samples) / 2, np.sqrt(n_samples)]
ensemble_clusters = map(int, ensemble_clusters)
generator = K_Means(cuda_mem="manual")
In [40]:
%time partMod.generateEnsembleToFiles(foldername, data, generator, n_clusters=ensemble_clusters, npartitions=n_partitions)
In [9]:
%time ensemble = partMod.generateEnsemble(data, generator, n_clusters=ensemble_clusters, npartitions=n_partitions)
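For readers without MyML, here is a minimal CPU sketch of what the ensemble generation presumably does, with scikit-learn's KMeans standing in for the GPU K_Means; it assumes each partition is a list of per-cluster index arrays, which is the format the plotting loop further below relies on.
In [ ]:
from sklearn.cluster import KMeans

def generate_ensemble_sklearn(data, k_range, n_partitions):
    # run K-Means n_partitions times, each with a random k in [k_range[0], k_range[1]]
    ensemble = []
    for _ in range(n_partitions):
        k = np.random.randint(k_range[0], k_range[1] + 1)
        labels = KMeans(n_clusters=k, n_init=1, max_iter=10).fit_predict(data)
        # store the partition as one index array per cluster
        ensemble.append([np.where(labels == c)[0].astype(np.int32)
                         for c in range(k)])
    return ensemble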
In [17]:
import MyML.cluster.eac as eac
In [42]:
reload(eac)
Out[42]:
In [43]:
foldername = "/home/diogoaos/QCThesis/datasets/gaussmix1e4/"
In [44]:
ensemble = partMod.loadEnsembleFromFiles(foldername=foldername)
In [18]:
i=0
In [20]:
# plot each cluster of partition i in its own color, then advance the index
for clust in ensemble[i]:
    plt.plot(data[clust,0], data[clust, 1], '.')
i += 1
if i >= len(ensemble):
    i = 0
In [194]:
# recover the number of samples from the cluster sizes of one partition
n_samples = 0
for clust in ensemble[0]:
    n_samples += clust.size
print n_samples
In [67]:
fullEAC = eac.EAC(n_samples = n_samples, mat_sparse = False)
In [68]:
%time fullEAC.fit(ensemble, assoc_mode="full", prot_mode="none")
Total time = 3.37s
In [69]:
full_nnz = fullEAC._coassoc.nonzero()[0].size
print "full matrix edges/vertices ratio : ", full_nnz * 1.0 / n_samples
In [161]:
coassoc = np.zeros((n_samples,n_samples))
%time eac.update_coassoc_with_ensemble(coassoc, ensemble)
Total time = 987 ms
In [163]:
(coassoc == fullEAC._coassoc).all()
Out[163]:
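For reference, a minimal numpy sketch of the standard EAC co-association update, assuming (as the equality check above suggests) that every pair of samples co-clustered in a partition gets its count incremented, diagonal included:
In [ ]:
def update_coassoc(coassoc, ensemble):
    # standard EAC rule: each cluster votes +1 for all pairs of its samples
    for partition in ensemble:
        for clust in partition:
            ix = np.ix_(clust, clust)
            coassoc[ix] += 1   # diagonal entries end up equal to n_partitions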
In [76]:
protEAC = eac.EAC(n_samples = n_samples, mat_sparse = False)
In [77]:
n_prots = int(0.1 * n_samples)
%time protEAC.fit(ensemble, assoc_mode="full", prot_mode="random", nprot=n_prots)
Total time = 421 ms
In [78]:
protEAC_nnz = protEAC._coassoc.nonzero()[0].size
print "prot matrix edges/vertices ratio : ", protEAC_nnz * 1.0 / n_samples
In [79]:
sparseEAC = eac.EAC(n_samples = n_samples, mat_sparse = True)
In [80]:
%time sparseEAC.fit(ensemble, assoc_mode="full")
Total time = 13min 21s
Save the full co-association matrix as CSV.
In [115]:
%time np.savetxt(foldername + "full_coassoc.csv", fullEAC._coassoc, fmt="%d", delimiter=",")
Save the full and prototype co-association matrices in CSR format.
In [82]:
from scipy.sparse import csr_matrix
In [116]:
%time full_sp = csr_matrix(fullEAC._coassoc)
%time prot_sp = csr_matrix(protEAC._coassoc)
In [114]:
# remove self-associations (every diagonal entry equals n_partitions)
fullEAC._coassoc[np.diag_indices_from(fullEAC._coassoc)] = 0
protEAC._coassoc[np.diag_indices_from(protEAC._coassoc)] = 0
In [157]:
np.where(fullEAC._coassoc[0:1666,0:1666]==100)[0]
Out[157]:
In [491]:
print full_sp
print prot_sp
In [117]:
np.savetxt(foldername + "full_dest.csr", full_sp.indices, fmt="%d",delimiter=",")
np.savetxt(foldername + "full_weight.csr", full_sp.data, fmt="%d",delimiter=",")
np.savetxt(foldername + "full_fe.csr", full_sp.indptr, fmt="%d",delimiter=",")
In [118]:
np.savetxt(foldername + "prot_dest.csr", prot_sp.indices, fmt="%d",delimiter=",")
np.savetxt(foldername + "prot_weight.csr", prot_sp.data, fmt="%d",delimiter=",")
np.savetxt(foldername + "prot_fe.csr", prot_sp.indptr, fmt="%d",delimiter=",")
In [ ]:
del full_sp, prot_sp
In [21]:
reload(eac)
Out[21]:
In [51]:
fullEAC = eac.EAC(n_samples = n_samples, mat_sparse = False)
%time fullEAC.fit(ensemble, assoc_mode="full", prot_mode="none")
In [23]:
print fullEAC.getMaxAssocs()[0]
print fullEAC.getNNZAssocs()
In [53]:
resPD=pd.DataFrame(columns=["col1","col2","col3"])
In [52]:
thresholds = np.arange(0, 1.01, 0.05)
res = np.empty((thresholds.size, 3))
for i in range(thresholds.size):
    res[i, 0] = thresholds[i]
    fullEAC.apply_threshold(thresholds[i])
    max_assocs, max_idx = fullEAC.getMaxAssocs()
    res[i, 1] = max_assocs
    # normalize by the number of non-zero associations before thresholding
    nnz_pc = fullEAC.getNNZAssocs() / 8533572.0
    res[i, 2] = nnz_pc
    print thresholds[i], max_assocs, nnz_pc
In [25]:
import pandas as pd
In [30]:
resPD = pd.DataFrame(res, columns=["threshold","max_assocs", "nnz percent relative to max"])
In [33]:
print resPD.to_latex()
In [ ]:
resPD
In [35]:
fullEAC._coassoc
Out[35]:
In [16]:
fullEAC._coassoc[::100,::100].shape
Out[16]: (100, 100)
In [18]:
plt.pcolor(fullEAC._coassoc[::50,::50])
Out[18]: [pcolor heatmap of the subsampled co-association matrix]
In [91]:
from numba import jit

@jit
def outdegree_from_firstedge(firstedge, outdegree, n_edges):
    # out-degree of vertex v is the gap between consecutive first-edge offsets
    n_vertices = firstedge.size
    for v in range(n_vertices - 1):
        outdegree[v] = firstedge[v + 1] - firstedge[v]
    outdegree[n_vertices - 1] = n_edges - firstedge[n_vertices - 1]
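A quick sanity check on a toy first-edge array (4 vertices, 6 edges):
In [ ]:
fe_toy = np.array([0, 2, 3, 5], dtype=np.int32)
od_toy = np.empty_like(fe_toy)
outdegree_from_firstedge(fe_toy, od_toy, 6)
print od_toy   # expected: [2 1 2 1]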
In [6]:
print foldername
In [119]:
%time dest = np.genfromtxt(foldername + "full_dest.csr", dtype = np.int32, delimiter=",")
%time weight = np.genfromtxt(foldername + "full_weight.csr", dtype = np.float32, delimiter=",")
%time fe = np.genfromtxt(foldername + "full_fe.csr", dtype = np.int32, delimiter=",")
In [86]:
dest = np.genfromtxt(foldername + "prot_dest.csr", dtype = np.int32, delimiter=",")
weight = np.genfromtxt(foldername + "prot_weight.csr", dtype = np.float32, delimiter=",")
fe = np.genfromtxt(foldername + "prot_fe.csr", dtype = np.int32, delimiter=",")
In [120]:
# drop the last indptr entry (the total edge count) to get the first-edge array
fe = fe[:-1]
In [121]:
od = np.empty_like(fe)
outdegree_from_firstedge(fe, od, dest.size)
In [126]:
# convert co-association counts (out of n_partitions = 100) into dissimilarities
weight = 100 - weight
In [127]:
print "# edges : ", dest.size
print "# vertices : ", fe.size
print "edges/vertices ratio : ", dest.size * 1.0 / fe.size
In [10]:
from numba import jit
@jit
def outdegree_from_firstedge(firstedge, outdegree, n_edges):
    n_vertices = firstedge.size
    for v in range(n_vertices - 1):
        outdegree[v] = firstedge[v + 1] - firstedge[v]
    outdegree[n_vertices - 1] = n_edges - firstedge[n_vertices - 1]
In [106]:
from numba import cuda
import MyML.graph.mst as myMST
import MyML.graph.build as graphBuild
import MyML.graph.connected_components as ccomps
import MyML.cluster.linkage as linkage
In [136]:
reload(linkage)
Out[136]:
In [142]:
%prun linkage.sl_mst_lifetime_gpu(dest, weight, fe, od, disconnect_weight = 100, MAX_TPB=512)
In [140]:
%time labels = linkage.sl_mst_lifetime_gpu(dest, weight, fe, od, disconnect_weight = 100, MAX_TPB=512)
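As a rough CPU cross-check, here is a scipy sketch of the same pipeline under my reading of the method (single-link clustering via a minimum spanning tree, cutting at the largest gap in the sorted MST edge weights, i.e. the lifetime criterion); this is an assumption about sl_mst_lifetime_gpu, not MyML's actual code.
In [ ]:
from scipy.sparse.csgraph import minimum_spanning_tree, connected_components

def sl_mst_lifetime_cpu(dest, weight, fe, n_vertices):
    # caveat: csgraph treats zero weights as missing edges; shift weights if needed
    indptr = np.append(fe, dest.size)          # rebuild the full CSR indptr
    graph = csr_matrix((weight, dest, indptr), shape=(n_vertices, n_vertices))
    mst = minimum_spanning_tree(graph).tocoo()
    w = np.sort(mst.data)
    gaps = np.diff(w)                          # edge "lifetimes"
    cut = w[gaps.argmax()] if gaps.size else np.inf
    keep = mst.data <= cut                     # drop edges above the largest gap
    pruned = csr_matrix((mst.data[keep], (mst.row[keep], mst.col[keep])),
                        shape=(n_vertices, n_vertices))
    return connected_components(pruned, directed=False)[1]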
In [149]:
import MyML.metrics.accuracy as accuracy
In [145]:
%time gt = np.genfromtxt(foldername + "ground_truth.csv", dtype = np.int32, delimiter=",")
In [151]:
scorer = accuracy.HungarianIndex(nsamples=gt.size)
%time scorer.score(gt, labels)
In [152]:
print scorer.accuracy
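For reference, a sketch of what a Hungarian-matched accuracy computes, assuming HungarianIndex permutes cluster labels so as to maximize agreement with the ground truth; scipy's linear_sum_assignment (the Hungarian algorithm) does the matching.
In [ ]:
from scipy.optimize import linear_sum_assignment

def hungarian_accuracy(gt, labels):
    # contingency[t, l] = number of samples with ground truth t and cluster label l
    contingency = np.zeros((gt.max() + 1, labels.max() + 1), dtype=np.int64)
    for t, l in zip(gt, labels):
        contingency[t, l] += 1
    row, col = linear_sum_assignment(-contingency)   # maximize total agreement
    return contingency[row, col].sum() * 1.0 / gt.size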